library(readxl)
library(dplyr)
library(stringr)
library(purrr)
library(lubridate)
library(data.table)
library(plm)
library(lmtest)
library(sandwich)
library(ggplot2)

#####################
## Data Processing ##
#####################
# vdem <- read.csv("V-Dem-CY-Full+Others-v14.csv")
navco <- read_excel("navco3-0full.xlsx")
navco$COWcode <- navco$cowcode
navco <- navco %>% mutate(year = as.numeric(format(date, "%Y")))

navco <- navco %>%
  mutate(
    elocal = localities %>%
      str_to_lower() %>%
      str_replace_all(" ", "") %>%
      str_replace_all("[^a-z0-9]", "") %>%
      str_trim()
  )

vdem.sel <- vdem %>% 
  dplyr::select(country_id, country_name, year, COWcode, v2xel_locelec, 
                v2x_polyarchy, e_gdppc, e_pop) 

navco <- left_join(navco, vdem.sel, by = c("COWcode", "year"))

navco <- navco %>%
  mutate(
    num_partic_clean = num_partic_event %>%
      str_to_lower() %>%
      str_replace_all(",", "") %>%
      str_replace_all(" to ", "-") %>%
      str_replace_all("[–—−]", "-") %>%
      str_replace_all("[^0-9\\-]", "") %>%
      str_replace_all("-{2,}", "-") %>%
      str_remove_all("^-|-$"),
    num_partic_numeric = case_when(
      is.na(num_partic_clean) | num_partic_clean %in% c("", "na", ".") ~ NA_real_,
      str_detect(num_partic_clean, "^\\d+$") ~ as.numeric(num_partic_clean),
      str_detect(num_partic_clean, "^\\d+-\\d+$") ~ map_dbl(num_partic_clean, ~mean(as.numeric(str_split(.x, "-", simplify = TRUE)))),
      TRUE ~ NA_real_
    ),
    fatal_casu_clean = fatal_casu %>%
      str_to_lower() %>%
      str_replace_all(",", "") %>%
      str_replace_all(" to ", "-") %>%
      str_replace_all("[–—−]", "-") %>%
      str_replace_all("[^0-9\\-]", "") %>%
      str_replace_all("-{2,}", "-") %>%
      str_remove_all("^-|-$"),
    fatal_casu_numeric = case_when(
      is.na(fatal_casu_clean) | fatal_casu_clean %in% c("", "na", ".") ~ NA_real_,
      str_detect(fatal_casu_clean, "^\\d+$") ~ as.numeric(fatal_casu_clean),
      str_detect(fatal_casu_clean, "^\\d+-\\d+$") ~ map_dbl(fatal_casu_clean, ~mean(as.numeric(str_split(.x, "-", simplify = TRUE)))),
      TRUE ~ NA_real_
    ),
    fatalities = fatal_casu_numeric,
    participants = num_partic_numeric,
    event_date = as.Date(date)
  )

navco <- navco %>% arrange(elocal, event_date)

setDT(navco)
setorder(navco, elocal, event_date)

navco <- navco %>%
  group_by(elocal) %>%
  mutate(cumulated_fatal_weighted = map_dbl(row_number(), function(i) {
    if (i > 1) {
      days_diff <- as.numeric(difftime(event_date[i], event_date[1:(i - 1)], units = "days"))
      weights <- 1 - (days_diff / max(days_diff))
      weighted.mean(fatalities[1:(i - 1)], w = weights, na.rm = TRUE)
    } else {
      NA_real_
    }
  })) %>%
  ungroup()

# Define decay parameter
half_life <- 730
lambda <- log(2) / half_life

# Create exponential decay weighted average
navco <- navco %>%
  arrange(elocal, event_date) %>%
  group_by(elocal) %>%
  mutate(cumulated_fatal_weighted_exp = map_dbl(row_number(), function(i) {
    if (i > 1) {
      days_diff <- as.numeric(difftime(event_date[i], event_date[1:(i - 1)], units = "days"))
      weights_exp <- exp(-lambda * days_diff)
      weighted.mean(fatalities[1:(i - 1)], w = weights_exp, na.rm = TRUE)
    } else {
      NA_real_
    }
  })) %>%
  ungroup()

navco_meaned <- navco %>%
  group_by(elocal, event_date) %>%
  dplyr::summarize(
    participants_mean = mean(participants, na.rm = TRUE),
    fatalities = mean(fatalities, na.rm = TRUE),
    v2x_polyarchy = mean(v2x_polyarchy, na.rm = TRUE),
    gdppc = mean(e_gdppc, na.rm = TRUE),
    pop = mean(e_pop, na.rm = TRUE),
    cumulated_fatal_weighted = mean(cumulated_fatal_weighted, na.rm = TRUE),
    cumulated_fatal_weighted_exp = mean(cumulated_fatal_weighted_exp, na.rm = TRUE),  
    v2xel_locelec = mean(v2xel_locelec, na.rm = TRUE),
    .groups = "drop"
  )

navco_final <- navco_meaned %>%
  arrange(elocal, event_date) %>%
  group_by(elocal) %>%
  mutate(
    fatality_diff_weighted = fatalities - cumulated_fatal_weighted,
    fatality_diff_weighted_exp = fatalities - dplyr::lag(cumulated_fatal_weighted_exp), 
    participants_lead = dplyr::lead(participants_mean)
  ) %>%
  ungroup() %>%
  mutate(
    fatality_diff_signed_weighted = sign(fatality_diff_weighted) * log1p(abs(fatality_diff_weighted)),
    fatality_diff_signed_weighted_exp = sign(fatality_diff_weighted_exp) * log1p(abs(fatality_diff_weighted_exp)),
    participants_lead = log1p(participants_lead)
  )

navco_final <- navco_final %>%
  filter(!is.na(elocal), !is.na(event_date))

navco_final$elocal <- factor(navco_final$elocal)

navco_final <- navco_final %>% 
  mutate(fatalities_log = log1p(fatalities)) %>%
  mutate(gdppc = log1p(gdppc)) %>% 
  mutate(pop = log1p(pop)) 

pdata <- pdata.frame(navco_final, index = c("elocal", "event_date"))

write.csv(pdata, "navco_local_final.csv", row.names = FALSE)
